# Packages used for Data pre-processing, unsupervised classifications, and plotting
# Basic libraries for data pre-processing/handling
import numpy as np
import pandas as pd
# Libraries for plotting
import plotly.express as px
import matplotlib.pyplot as plt
# Library used to standardize data
from sklearn.preprocessing import MinMaxScaler
# Libraries used for unsupervised classification and clustering
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
# K-means classification
from sklearn.cluster import KMeans
# PCA clustering
from sklearn.decomposition import PCA
# Load in the PBTA dataset
# (miRNA expression counts, transposed: one row per tumor sample, one column
# per miRNA, plus a "class" column holding the tumor-type label — see the
# preview below: 264 samples x 2084 columns)
df = pd.read_csv("pbta_mirna_tdata.csv")
df.head()
| class | let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | let-7e-3p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATRT | 54 | 142 | 52379 | 21782 | 216 | 35044 | 1729 | 19661 | 296 | ... | 2 | 165 | 7 | 25001 | 14 | 1853 | 10 | 35498 | 422 | 10995 |
| 1 | ATRT | 136 | 109 | 39346 | 10494 | 38 | 16308 | 2132 | 20500 | 245 | ... | 34 | 51 | 35 | 1840 | 46 | 144 | 20 | 9680 | 443 | 7100 |
| 2 | ATRT | 13 | 74 | 39079 | 16633 | 159 | 30219 | 1614 | 13793 | 221 | ... | 3 | 27 | 6 | 5624 | 1 | 54 | 3 | 36509 | 312 | 7834 |
| 3 | ATRT | 28 | 64 | 19514 | 5273 | 38 | 9328 | 1003 | 10016 | 154 | ... | 2 | 307 | 3 | 4800 | 6 | 3818 | 3 | 9431 | 431 | 9542 |
| 4 | ATRT | 144 | 92 | 69964 | 20807 | 245 | 43025 | 1101 | 16204 | 291 | ... | 1 | 1907 | 0 | 94839 | 0 | 293 | 8 | 69583 | 375 | 11645 |
5 rows × 2084 columns
# Removing class labels to standardize data and perform unsupervised classifications
cancer_class = df["class"]
class_labels = cancer_class.to_list()
# Integer-encode each tumor type by its first-appearance order.
# pd.factorize does this in a single O(n) pass and is equivalent to the
# original dict.fromkeys + np.where-per-element construction, which was
# O(n * k) and much harder to read.
class_labels_num = pd.factorize(cancer_class)[0].tolist()
df = df.drop(["class"], axis=1)
df.head()
| let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | let-7e-3p | let-7e-5p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 54 | 142 | 52379 | 21782 | 216 | 35044 | 1729 | 19661 | 296 | 11693 | ... | 2 | 165 | 7 | 25001 | 14 | 1853 | 10 | 35498 | 422 | 10995 |
| 1 | 136 | 109 | 39346 | 10494 | 38 | 16308 | 2132 | 20500 | 245 | 9096 | ... | 34 | 51 | 35 | 1840 | 46 | 144 | 20 | 9680 | 443 | 7100 |
| 2 | 13 | 74 | 39079 | 16633 | 159 | 30219 | 1614 | 13793 | 221 | 8330 | ... | 3 | 27 | 6 | 5624 | 1 | 54 | 3 | 36509 | 312 | 7834 |
| 3 | 28 | 64 | 19514 | 5273 | 38 | 9328 | 1003 | 10016 | 154 | 4879 | ... | 2 | 307 | 3 | 4800 | 6 | 3818 | 3 | 9431 | 431 | 9542 |
| 4 | 144 | 92 | 69964 | 20807 | 245 | 43025 | 1101 | 16204 | 291 | 14064 | ... | 1 | 1907 | 0 | 94839 | 0 | 293 | 8 | 69583 | 375 | 11645 |
5 rows × 2083 columns
# Rescale every miRNA column into the [0, 1] range so that highly expressed
# genes do not dominate the distance-based clustering that follows.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df[df.columns] = scaled_values
df.head()
| let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | let-7e-3p | let-7e-5p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.154286 | 0.338902 | 0.143289 | 0.063826 | 0.302521 | 0.139854 | 0.297516 | 0.225642 | 0.139480 | 0.281512 | ... | 0.019417 | 0.036400 | 0.118644 | 0.038400 | 0.026718 | 0.005520 | 0.222222 | 0.131925 | 0.053069 | 0.336820 |
| 1 | 0.388571 | 0.260143 | 0.107580 | 0.030637 | 0.053221 | 0.064993 | 0.367023 | 0.235279 | 0.115366 | 0.218898 | ... | 0.330097 | 0.011251 | 0.593220 | 0.002809 | 0.087786 | 0.000429 | 0.444444 | 0.035788 | 0.055717 | 0.217349 |
| 2 | 0.037143 | 0.176611 | 0.106849 | 0.048687 | 0.222689 | 0.120575 | 0.277682 | 0.158235 | 0.104019 | 0.200429 | ... | 0.029126 | 0.005956 | 0.101695 | 0.008624 | 0.001908 | 0.000161 | 0.066667 | 0.135689 | 0.039203 | 0.239863 |
| 3 | 0.080000 | 0.152745 | 0.053244 | 0.015286 | 0.053221 | 0.037103 | 0.172301 | 0.114848 | 0.072340 | 0.117224 | ... | 0.019417 | 0.067726 | 0.050847 | 0.007358 | 0.011450 | 0.011373 | 0.066667 | 0.034861 | 0.054204 | 0.292252 |
| 4 | 0.411429 | 0.219570 | 0.191469 | 0.060959 | 0.343137 | 0.171743 | 0.189203 | 0.185931 | 0.137116 | 0.338678 | ... | 0.009709 | 0.420693 | 0.000000 | 0.145717 | 0.000000 | 0.000873 | 0.177778 | 0.258845 | 0.047145 | 0.356757 |
5 rows × 2083 columns
# Hierarchical Clustering
# AgglomerativeClustering defaults to n_clusters=2, which is why the earlier
# run collapsed every tumor subtype into a binary split (see the note below
# the original output). Ask for one cluster per distinct tumor type instead.
n_tumor_types = len(dict.fromkeys(class_labels))
hclust = AgglomerativeClustering(n_clusters=n_tumor_types).fit(df)
hclust.labels_
array([1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
Notes: Hierarchical clustering (run with its default of two clusters) split the tumor samples into only two groups rather than separating the subtypes.
# K-means clustering: partition the samples into 10 clusters, with a fixed
# seed so the assignment is reproducible across runs.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(df)
kmeans.labels_
array([5, 3, 2, 0, 1, 2, 0, 1, 7, 0, 0, 1, 5, 1, 1, 2, 5, 5, 2, 1, 2, 2,
2, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1,
1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 5, 1,
2, 1, 1, 4, 2, 1, 1, 1, 1, 1, 6, 1, 5, 1, 1, 1, 9, 9, 9, 9, 9, 9,
9, 9, 8, 1, 1, 2, 9, 5, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 9, 9, 5, 9, 9, 5, 2, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 3, 1, 5, 1, 1, 1, 1, 1, 2, 1,
1, 2, 2, 1, 0, 1, 9, 9, 9, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2],
dtype=int32)
Notes: Unsupervised classification of brain tumor samples
# Initialize PCA object
pca = PCA(n_components=4) # Looking at only the first four principal components
# Perform PCA on PBTA dataset
# df_comps has shape (n_samples, 4); columns 0-3 are the PC1-PC4 scores.
df_comps = pca.fit_transform(df)
# 2D PCA scatter plots: first coloured by the K-means cluster assignment,
# then by the true tumor labels, so the two groupings can be compared.
for colour_values, plot_title in (
    (kmeans.labels_, "PCA Plot based on K-means Predicted Tumor Labels"),
    (class_labels_num, "PCA Plot based on True Tumor Labels"),
):
    fig = px.scatter(df_comps, x=0, y=1, color=colour_values)
    fig.update_layout(title=plot_title, xaxis_title="PC1", yaxis_title="PC2")
    fig.show()
# 3D scatter plots of the first three principal components, coloured by the
# K-means assignment and by the true tumor labels respectively.
for colour_values, plot_title in (
    (kmeans.labels_, "3D PCA Plot based on K-means Predicted Tumor Labels"),
    (class_labels_num, "3D PCA Plot based on True Tumor Labels"),
):
    fig = px.scatter_3d(df_comps, x=0, y=1, z=2, color=colour_values)
    fig.update_layout(title=plot_title)
    fig.show()
Notes:
# Feature Selection using Variance Threshold library
from sklearn.feature_selection import VarianceThreshold
# Create function to produce filtered PBTA dataset based on selected Variance Threshold value
# Create function to produce filtered PBTA dataset based on selected Variance Threshold value
def variance_threshold_selector(data, threshold=0.5):
    """Return *data* restricted to columns whose variance exceeds *threshold*.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature table (samples x features).
    threshold : float, default 0.5
        Features are kept when their variance is strictly greater than this.

    Notes
    -----
    Uses the population variance (ddof=0) and a strict ``>`` comparison,
    matching sklearn's ``VarianceThreshold``, but without fitting an extra
    estimator. Column order is preserved; if no feature passes, an empty
    DataFrame is returned (sklearn would raise instead).
    """
    return data.loc[:, data.var(ddof=0) > threshold]
# Keep only the miRNAs whose (min-max-scaled) expression variance exceeds 0.02
# (the displayed result drops the feature count from 2083 to 194 columns)
df_filtered = variance_threshold_selector(df, 0.02)
df_filtered
| let-7a-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7i-5p | miR-100-5p | miR-101-5p | miR-106a-5p | miR-1227-3p | miR-1244 | ... | miR-887-5p | miR-889-5p | miR-92a-1-5p | miR-92a-3p | miR-93-5p | miR-9-3p | miR-944 | miR-9-5p | miR-99a-5p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.143289 | 0.302521 | 0.139854 | 0.297516 | 0.169913 | 0.162352 | 0.028509 | 0.088332 | 0.432432 | 0.285301 | ... | 0.165049 | 0.333333 | 0.121053 | 0.096909 | 0.096866 | 0.009828 | 0.019417 | 0.038400 | 0.131925 | 0.336820 |
| 1 | 0.107580 | 0.053221 | 0.064993 | 0.367023 | 0.277541 | 0.085068 | 0.242873 | 0.076490 | 0.554054 | 0.079411 | ... | 0.330097 | 0.714286 | 0.152632 | 0.111613 | 0.085791 | 0.001724 | 0.330097 | 0.002809 | 0.035788 | 0.217349 |
| 2 | 0.106849 | 0.222689 | 0.120575 | 0.277682 | 0.128592 | 0.164702 | 0.168860 | 0.729255 | 0.209459 | 0.185246 | ... | 0.155340 | 0.000000 | 0.100000 | 0.334132 | 0.119272 | 0.001652 | 0.029126 | 0.008624 | 0.135689 | 0.239863 |
| 3 | 0.053244 | 0.053221 | 0.037103 | 0.172301 | 0.089621 | 0.056314 | 0.021930 | 0.423439 | 0.351351 | 0.242224 | ... | 0.000000 | 0.000000 | 0.631579 | 0.283130 | 0.287190 | 0.002740 | 0.019417 | 0.007358 | 0.034861 | 0.292252 |
| 4 | 0.191469 | 0.343137 | 0.171743 | 0.189203 | 0.106273 | 0.395581 | 0.071820 | 0.121239 | 0.371622 | 0.233141 | ... | 0.038835 | 0.023810 | 0.152632 | 0.172057 | 0.140428 | 0.056377 | 0.009709 | 0.145717 | 0.258845 | 0.356757 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 259 | 0.573254 | 0.147059 | 0.336218 | 0.123491 | 0.313269 | 0.194475 | 0.625548 | 0.246264 | 0.324324 | 0.778833 | ... | 0.067961 | 0.071429 | 0.247368 | 0.259524 | 0.587547 | 0.192195 | 0.038835 | 0.337382 | 0.223593 | 0.278480 |
| 260 | 0.572942 | 0.247899 | 0.512316 | 0.083994 | 0.195404 | 0.456029 | 0.690241 | 0.168095 | 0.189189 | 0.354803 | ... | 0.048544 | 0.047619 | 0.126316 | 0.132789 | 0.561155 | 0.076084 | 0.000000 | 0.226200 | 0.569656 | 0.229188 |
| 261 | 0.125296 | 0.047619 | 0.103858 | 0.131425 | 0.221471 | 0.088010 | 0.071272 | 0.070227 | 0.054054 | 0.030829 | ... | 0.000000 | 0.000000 | 0.042105 | 0.044081 | 0.024489 | 0.002072 | 0.000000 | 0.008190 | 0.066024 | 0.075977 |
| 262 | 0.210804 | 0.035014 | 0.202541 | 0.554502 | 0.293949 | 0.402309 | 0.218202 | 0.430442 | 0.067568 | 0.075420 | ... | 0.300971 | 0.142857 | 0.145614 | 0.259640 | 0.083926 | 0.002292 | 0.009709 | 0.001466 | 0.507576 | 0.370744 |
| 263 | 0.065880 | 0.026611 | 0.054980 | 0.089514 | 0.094537 | 0.057825 | 0.050439 | 0.033847 | 0.033784 | 0.050096 | ... | 0.009709 | 0.000000 | 0.012281 | 0.051658 | 0.031531 | 0.012591 | 0.000000 | 0.018062 | 0.044981 | 0.047880 |
264 rows × 194 columns
# Perform PCA on filtered dataset (re-fits the existing PCA object, so the
# previously computed components are replaced)
df_filtered_comps = pca.fit_transform(df_filtered)
# Display a 3D PCA plot of the PBTA dataset using only 194 miRNA genes that account for the greatest variance
filtered_fig = px.scatter_3d(df_filtered_comps, x=0, y=1, z=2, color=class_labels_num) # Using three principal components
filtered_fig.show()
# Libraries used for Machine Learning Classification
# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier
# Re-attach the tumor labels to the scaled features to build the
# supervised-learning table (assign returns a copy, leaving df untouched).
ml_df = df.assign(**{"class": cancer_class})
ml_df
| let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | let-7e-3p | let-7e-5p | ... | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.154286 | 0.338902 | 0.143289 | 0.063826 | 0.302521 | 0.139854 | 0.297516 | 0.225642 | 0.139480 | 0.281512 | ... | 0.036400 | 0.118644 | 0.038400 | 0.026718 | 0.005520 | 0.222222 | 0.131925 | 0.053069 | 0.336820 | ATRT |
| 1 | 0.388571 | 0.260143 | 0.107580 | 0.030637 | 0.053221 | 0.064993 | 0.367023 | 0.235279 | 0.115366 | 0.218898 | ... | 0.011251 | 0.593220 | 0.002809 | 0.087786 | 0.000429 | 0.444444 | 0.035788 | 0.055717 | 0.217349 | ATRT |
| 2 | 0.037143 | 0.176611 | 0.106849 | 0.048687 | 0.222689 | 0.120575 | 0.277682 | 0.158235 | 0.104019 | 0.200429 | ... | 0.005956 | 0.101695 | 0.008624 | 0.001908 | 0.000161 | 0.066667 | 0.135689 | 0.039203 | 0.239863 | ATRT |
| 3 | 0.080000 | 0.152745 | 0.053244 | 0.015286 | 0.053221 | 0.037103 | 0.172301 | 0.114848 | 0.072340 | 0.117224 | ... | 0.067726 | 0.050847 | 0.007358 | 0.011450 | 0.011373 | 0.066667 | 0.034861 | 0.054204 | 0.292252 | ATRT |
| 4 | 0.411429 | 0.219570 | 0.191469 | 0.060959 | 0.343137 | 0.171743 | 0.189203 | 0.185931 | 0.137116 | 0.338678 | ... | 0.420693 | 0.000000 | 0.145717 | 0.000000 | 0.000873 | 0.177778 | 0.258845 | 0.047145 | 0.356757 | ATRT |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 259 | 0.008571 | 0.121718 | 0.573254 | 0.176071 | 0.147059 | 0.336218 | 0.123491 | 0.158419 | 0.088416 | 0.248770 | ... | 0.019854 | 0.000000 | 0.337382 | 0.137405 | 0.177326 | 0.044444 | 0.223593 | 0.040590 | 0.278480 | Medulloblastoma |
| 260 | 0.008571 | 0.119332 | 0.572942 | 0.174848 | 0.247899 | 0.512316 | 0.083994 | 0.133262 | 0.071868 | 0.220127 | ... | 0.035297 | 0.000000 | 0.226200 | 0.015267 | 0.018591 | 0.022222 | 0.569656 | 0.026976 | 0.229188 | Medulloblastoma |
| 261 | 0.042857 | 0.054893 | 0.125296 | 0.058948 | 0.047619 | 0.103858 | 0.131425 | 0.100466 | 0.022222 | 0.105193 | ... | 0.018751 | 0.000000 | 0.008190 | 0.000000 | 0.000098 | 0.000000 | 0.066024 | 0.010967 | 0.075977 | Schwannoma |
| 262 | 0.217143 | 0.033413 | 0.210804 | 0.263921 | 0.035014 | 0.202541 | 0.554502 | 0.295299 | 0.104965 | 0.241200 | ... | 0.012795 | 0.016949 | 0.001466 | 0.000000 | 0.000018 | 0.066667 | 0.507576 | 0.032648 | 0.370744 | Schwannoma |
| 263 | 0.008571 | 0.019093 | 0.065880 | 0.029196 | 0.026611 | 0.054980 | 0.089514 | 0.047384 | 0.013239 | 0.058612 | ... | 0.015222 | 0.000000 | 0.018062 | 0.000000 | 0.000438 | 0.044444 | 0.044981 | 0.005546 | 0.047880 | Teratoma |
264 rows × 2084 columns
# Split the dataset into features (miRNA expression) and labels (tumor type)
X = ml_df.loc[:, ml_df.columns != 'class'].values
y = ml_df['class'].to_numpy()
# Hold out 25% of the PBTA samples as a test set; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Sanity check on the resulting shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(198, 2083) (66, 2083) (198,) (66,)
# Random forest classifier with a fixed seed for reproducibility.
# NOTE(review): max_depth=2 keeps every tree very shallow and may underfit a
# 2083-feature problem — worth tuning, but left unchanged here.
rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(X_train, y_train)
# Predict tumor types for the held-out samples
rfc_y_pred = rfc.predict(X_test)
# Fraction of held-out samples classified correctly
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
Accuracy: 0.5757575757575758
# Calculate a confusion matrix (rows = true labels, columns = predictions,
# both ordered by rfc.classes_, i.e. sorted label order)
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type.
# BUGFIX: the tick labels must follow rfc.classes_, the same ordering used to
# build `cm`. The original used cancer_class.unique() (first-appearance
# order), which mislabels the axes whenever that order differs from the
# sorted order. The colour legend is a sample count, not "Productivity".
tumor_types = rfc.classes_.tolist()
disp = px.imshow(cm, text_auto=True,
                 labels=dict(x="Predicted Tumor Type", y="True Tumor Type", color="Count"),
                 x=tumor_types,
                 y=tumor_types
                 )
disp.show()
Notes:
# What are the most important features?
feature_list = ml_df.columns.drop('class')
# Rank every miRNA by its random-forest (impurity-based) importance
imp_features = pd.Series(rfc.feature_importances_, index=feature_list)
imp_genes = imp_features.sort_values(ascending=False).to_frame().reset_index()
imp_genes.columns = ["features", "importance"]
# Keep only features the forest actually used (importance > 0). Filtering on
# the numeric column alone avoids the original's elementwise comparison of
# the whole mixed-type frame (including the string "features" column)
# against 0.0, which is fragile across pandas versions.
imp_genes_fil = imp_genes[imp_genes["importance"] > 0]
imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-106b-5p | 0.030923 |
| 1 | miR-93-5p | 0.021131 |
| 2 | miR-106b-3p | 0.019165 |
| 3 | miR-34b-5p | 0.017196 |
| 4 | miR-345-5p | 0.016621 |
| ... | ... | ... |
| 204 | miR-301a-5p | 0.000539 |
| 205 | miR-4999-5p | 0.000534 |
| 206 | miR-758-3p | 0.000491 |
| 207 | miR-224-3p | 0.000457 |
| 208 | miR-493-3p | 0.000446 |
209 rows × 2 columns
# Display interactive Barplot of important miRNAs
bar_fig = px.bar(imp_genes_fil, x=imp_genes_fil.features, y=imp_genes_fil.importance)
bar_fig.show()
# Optional export of the important-gene list to "imp_genes.txt" (disabled):
# with open("imp_genes.txt", "w") as textfile:
#     for element in imp_genes_fil.index:
#         textfile.write(element + "\n")
# Class balance of the dataset: how many samples of each tumor type
hist_fig = px.histogram(ml_df, x="class", nbins=10)
hist_fig.show()
# Library for UMAP projections
from umap.umap_ import UMAP
# 2D and 3D UMAP embeddings of the full scaled dataset; fixed seeds keep the
# layouts reproducible across runs.
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(df)
proj_3d = umap_3d.fit_transform(df)
# Colour each embedded sample by its true tumor label
px.scatter(proj_2d, x=0, y=1, color=class_labels_num).show()
px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels_num).show()
# Repeat the UMAP embedding using only the variance-filtered miRNA features
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(df_filtered)
proj_3d = umap_3d.fit_transform(df_filtered)
# 2D and 3D views, coloured by the true tumor labels
px.scatter(proj_2d, x=0, y=1, color=class_labels_num).show()
px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels_num).show()
# Make a list of genes that ML model found important
ml_fil_genes = list(imp_genes_fil['features'])
# Create a dataframe containing only those genes
ml_fil_df = ml_df.filter(ml_fil_genes)
# UMAP embedding restricted to the random-forest-selected miRNAs
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil_df)
proj_3d = umap_3d.fit_transform(ml_fil_df)
# 2D and 3D views, coloured by the true tumor labels
px.scatter(proj_2d, x=0, y=1, color=class_labels_num).show()
px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels_num).show()
# Library for t-SNE projections
from sklearn.manifold import TSNE
# Initialize t-SNE objects for 2D and 3D embeddings.
# init='random' is passed explicitly: it is the current default (see the
# FutureWarning the original run emitted), and pinning it keeps the results
# unchanged when sklearn 1.2 switches the default to 'pca'.
tsne_2d = TSNE(n_components=2, random_state=0, learning_rate='auto', init='random')
tsne_3d = TSNE(n_components=3, random_state=0, learning_rate='auto', init='random')
# Fit and transform the full scaled dataset
tsne_proj_2d = tsne_2d.fit_transform(df)
tsne_proj_3d = tsne_3d.fit_transform(df)
# Colour the embedded samples by their true tumor labels
tsne_fig_2d = px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels_num)
tsne_fig_3d = px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels_num)
tsne_fig_2d.show()
tsne_fig_3d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. /Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
# Fit and transform the variance-filtered dataset with the same t-SNE objects
tsne_proj_2d = tsne_2d.fit_transform(df_filtered)
tsne_proj_3d = tsne_3d.fit_transform(df_filtered)
# 2D and 3D t-SNE plots coloured by the true tumor labels
px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels_num).show()
px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels_num).show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. /Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
# Fit and transform the dataset restricted to the random-forest-selected genes
tsne_proj_2d = tsne_2d.fit_transform(ml_fil_df)
tsne_proj_3d = tsne_3d.fit_transform(ml_fil_df)
# 2D and 3D t-SNE plots coloured by the true tumor labels
px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels_num).show()
px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels_num).show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. /Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.